In [1]:
import pandas as pd
import os

# Specify the directory where your CSV files are located
directory = "data"

# List to store all DataFrames
all_dataframes = []

# Loop through each year (2018 to 2025)
for year in range(2018, 2026):  # 2026 to include 2025
    # Construct the filename (e.g., "SR2018.csv", "SR2019.csv", etc.)
    filename = f"SR{year}.csv"
    file_path = os.path.join(directory, filename)

    # Check if the file exists
    if os.path.exists(file_path):
        try:
            # Try reading with UTF-8 first, then fall back to Latin-1 if UTF-8 fails
            try:
                df = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip')
            except UnicodeDecodeError:
                print(f"UTF-8 failed for {filename}. Trying Latin-1 encoding...")
                df = pd.read_csv(file_path, encoding='latin1', on_bad_lines='skip')
            
            # Optionally, add a 'Year' column to track the origin
            df['Year'] = year
            all_dataframes.append(df)
            print(f"Successfully loaded {filename}")
        except Exception as e:
            print(f"Error loading {filename}: {e}")
    else:
        print(f"File {filename} not found")

# Concatenate all DataFrames into one
if all_dataframes:
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    
    # Save the combined DataFrame to a new CSV file in the 'data' folder
    output_file = os.path.join(directory, "SR_all_years.csv")
    combined_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Combined CSV saved as {output_file}")
else:
    print("No files were loaded. Check file paths and names.")

# Optional: Display the first few rows of the combined data
print("\nPreview of combined data:")
print(combined_df.head())
Successfully loaded SR2018.csv
Successfully loaded SR2019.csv
Successfully loaded SR2020.csv
Successfully loaded SR2021.csv
Successfully loaded SR2022.csv
Successfully loaded SR2023.csv
Successfully loaded SR2024.csv
UTF-8 failed for SR2025.csv. Trying Latin-1 encoding...
Successfully loaded SR2025.csv
Combined CSV saved as data/SR_all_years.csv

Preview of combined data:
                 Creation Date        Status First 3 Chars of Postal Code  \
0  2018-01-01 00:06:06.0000000        Closed                          M1N   
1  2018-01-01 00:14:04.0000000        Closed                          M2M   
2  2018-01-01 00:17:19.0000000        Closed                          M8V   
3  2018-01-01 00:24:35.0000000        Closed                          M1K   
4  2018-01-01 00:28:44.0000000  In-progress                           M4L   

  Intersection Street 1 Intersection Street 2                        Ward  \
0                   NaN                   NaN  Scarborough Southwest (20)   
1                   NaN                   NaN             Willowdale (18)   
2                   NaN                   NaN    Etobicoke-Lakeshore (03)   
3                   NaN                   NaN     Scarborough Centre (21)   
4                   NaN                   NaN      Beaches-East York (19)   

          Service Request Type                         Division  \
0                        Noise  Municipal Licensing & Standards   
1                        Noise  Municipal Licensing & Standards   
2          INJUR/DIST WILDLIFE  Municipal Licensing & Standards   
3     Watermain-Possible Break                    Toronto Water   
4  Water Service Line-No Water                    Toronto Water   

                   Section  Year  
0     District Enforcement  2018  
1     District Enforcement  2018  
2  Toronto Animal Services  2018  
3             District Ops  2018  
4             District Ops  2018  
In [2]:
# Load and inspect the combined CSV

# Load the combined CSV
combined_df = pd.read_csv("data/SR_all_years.csv")

# Display basic info
print("Combined DataFrame Info:")
print(combined_df.info())

# Check rows per year
print("\nRows per year:")
print(combined_df['Year'].value_counts().sort_index())

# Check missing values
print("\nMissing values per column:")
print(combined_df.isnull().sum())
Combined DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2771056 entries, 0 to 2771055
Data columns (total 10 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   Creation Date                 object
 1   Status                        object
 2   First 3 Chars of Postal Code  object
 3   Intersection Street 1         object
 4   Intersection Street 2         object
 5   Ward                          object
 6   Service Request Type          object
 7   Division                      object
 8   Section                       object
 9   Year                          int64 
dtypes: int64(1), object(9)
memory usage: 211.4+ MB
None

Rows per year:
Year
2018    412095
2019    438914
2020    347748
2021    323880
2022    427903
2023    385975
2024    402824
2025     31717
Name: count, dtype: int64

Missing values per column:
Creation Date                         0
Status                                0
First 3 Chars of Postal Code          0
Intersection Street 1           2383434
Intersection Street 2           2385286
Ward                                  0
Service Request Type                  0
Division                              0
Section                               0
Year                                  0
dtype: int64
In [3]:
print (combined_df['Ward'].value_counts().sort_index())
Ward
Beaches-East York (19)           150380
Davenport (09)                   146529
Don Valley East (16)              63918
Don Valley North (17)             77239
Don Valley West (15)             112122
Eglinton-Lawrence (08)           142718
Etobicoke Centre (02)            122626
Etobicoke North (01)              86647
Etobicoke-Lakeshore (03)         158161
Humber River-Black Creek (07)     75289
Parkdale-High Park (04)          141563
Scarborough Centre (21)          105139
Scarborough North (23)            66251
Scarborough Southwest (20)       129584
Scarborough-Agincourt (22)        67949
Scarborough-Guildwood (24)        85788
Scarborough-Rouge Park (25)      105995
Spadina-Fort York (10)           106685
Toronto Centre (13)               93312
Toronto-Danforth (14)            162181
Toronto-St. Paul's (12)          123903
University-Rosedale (11)         147825
Unknown                              25
Willowdale (18)                   83609
York Centre (06)                  96189
York South-Weston (05)           119429
Name: count, dtype: int64
In [4]:
# Cleaning the data

# Convert Creation Date to datetime
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'], errors='coerce')

# Extract time-based features
combined_df['Month'] = combined_df['Creation Date'].dt.month
combined_df['DayOfWeek'] = combined_df['Creation Date'].dt.dayofweek  # 0 = Monday, 6 = Sunday
combined_df['Hour'] = combined_df['Creation Date'].dt.hour

# Create Season feature
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'
combined_df['Season'] = combined_df['Month'].apply(get_season)

# Drop columns with excessive missing values
combined_df = combined_df.drop(columns=['Intersection Street 1', 'Intersection Street 2'])

# Check for any invalid datetime conversions
print("Rows with invalid Creation Date:", combined_df['Creation Date'].isna().sum())

# Preview cleaned data
print("\nCleaned DataFrame Info:")
print(combined_df.info())
print("\nPreview of cleaned data:")
print(combined_df.head())

# Save cleaned data
cleaned_file = "data/SR_all_years_cleaned.csv"
combined_df.to_csv(cleaned_file, index=False, encoding='utf-8')
print(f"Cleaned data saved as {cleaned_file}")
Rows with invalid Creation Date: 0

Cleaned DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2771056 entries, 0 to 2771055
Data columns (total 12 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   Creation Date                 datetime64[ns]
 1   Status                        object        
 2   First 3 Chars of Postal Code  object        
 3   Ward                          object        
 4   Service Request Type          object        
 5   Division                      object        
 6   Section                       object        
 7   Year                          int64         
 8   Month                         int32         
 9   DayOfWeek                     int32         
 10  Hour                          int32         
 11  Season                        object        
dtypes: datetime64[ns](1), int32(3), int64(1), object(7)
memory usage: 222.0+ MB
None

Preview of cleaned data:
        Creation Date        Status First 3 Chars of Postal Code  \
0 2018-01-01 00:06:06        Closed                          M1N   
1 2018-01-01 00:14:04        Closed                          M2M   
2 2018-01-01 00:17:19        Closed                          M8V   
3 2018-01-01 00:24:35        Closed                          M1K   
4 2018-01-01 00:28:44  In-progress                           M4L   

                         Ward         Service Request Type  \
0  Scarborough Southwest (20)                        Noise   
1             Willowdale (18)                        Noise   
2    Etobicoke-Lakeshore (03)          INJUR/DIST WILDLIFE   
3     Scarborough Centre (21)     Watermain-Possible Break   
4      Beaches-East York (19)  Water Service Line-No Water   

                          Division                  Section  Year  Month  \
0  Municipal Licensing & Standards     District Enforcement  2018      1   
1  Municipal Licensing & Standards     District Enforcement  2018      1   
2  Municipal Licensing & Standards  Toronto Animal Services  2018      1   
3                    Toronto Water             District Ops  2018      1   
4                    Toronto Water             District Ops  2018      1   

   DayOfWeek  Hour  Season  
0          0     0  Winter  
1          0     0  Winter  
2          0     0  Winter  
3          0     0  Winter  
4          0     0  Winter  
Cleaned data saved as data/SR_all_years_cleaned.csv
In [5]:
# Exploratory Data Analysis (EDA)
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Request Volume by Year
plt.figure(figsize=(10, 6))
combined_df['Year'].value_counts().sort_index().plot(kind='bar', color='skyblue')
plt.title('Service Request Volume by Year (2018-2025)')
plt.xlabel('Year')
plt.ylabel('Number of Requests')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. Request Volume by Season
plt.figure(figsize=(10, 6))
sns.countplot(data=combined_df, x='Season', order=['Winter', 'Spring', 'Summer', 'Fall'], palette='viridis')
plt.title('Service Request Volume by Season')
plt.xlabel('Season')
plt.ylabel('Number of Requests')
plt.tight_layout()
plt.show()

# 3. Top 10 Service Request Types
plt.figure(figsize=(12, 6))
top_types = combined_df['Service Request Type'].value_counts().head(10)
sns.barplot(x=top_types.values, y=top_types.index, palette='muted')
plt.title('Top 10 Service Request Types (2018-2025)')
plt.xlabel('Number of Requests')
plt.ylabel('Service Request Type')
plt.tight_layout()
plt.show()

# 4. Summary Statistics
print("\nSummary Statistics:")
print(combined_df.describe(include='all'))
No description has been provided for this image
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/577754450.py:17: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=combined_df, x='Season', order=['Winter', 'Spring', 'Summer', 'Fall'], palette='viridis')
No description has been provided for this image
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/577754450.py:27: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_types.values, y=top_types.index, palette='muted')
No description has been provided for this image
Summary Statistics:
                        Creation Date   Status First 3 Chars of Postal Code  \
count                         2771056  2771056                      2771056   
unique                            NaN        9                          101   
top                               NaN   Closed                 Intersection   
freq                              NaN  1274923                       387622   
mean    2021-07-04 19:42:56.926410752      NaN                          NaN   
min               2018-01-01 00:06:06      NaN                          NaN   
25%     2019-08-07 11:38:18.249999872      NaN                          NaN   
50%        2021-07-06 11:27:45.500000      NaN                          NaN   
75%               2023-04-24 12:59:02      NaN                          NaN   
max               2025-01-31 23:56:11      NaN                          NaN   
std                               NaN      NaN                          NaN   

                         Ward                     Service Request Type  \
count                 2771056                                  2771056   
unique                     26                                      852   
top     Toronto-Danforth (14)  Residential: Bin: Repair or Replace Lid   
freq                   162181                                   112751   
mean                      NaN                                      NaN   
min                       NaN                                      NaN   
25%                       NaN                                      NaN   
50%                       NaN                                      NaN   
75%                       NaN                                      NaN   
max                       NaN                                      NaN   
std                       NaN                                      NaN   

                               Division      Section          Year  \
count                           2771056      2771056  2.771056e+06   
unique                                9           30           NaN   
top     Solid Waste Management Services  Collections           NaN   
freq                            1028897      1023003           NaN   
mean                                NaN          NaN  2.021026e+03   
min                                 NaN          NaN  2.018000e+03   
25%                                 NaN          NaN  2.019000e+03   
50%                                 NaN          NaN  2.021000e+03   
75%                                 NaN          NaN  2.023000e+03   
max                                 NaN          NaN  2.025000e+03   
std                                 NaN          NaN  2.073598e+00   

               Month     DayOfWeek          Hour   Season  
count   2.771056e+06  2.771056e+06  2.771056e+06  2771056  
unique           NaN           NaN           NaN        4  
top              NaN           NaN           NaN   Summer  
freq             NaN           NaN           NaN   811628  
mean    6.289808e+00  2.566666e+00  1.321489e+01      NaN  
min     1.000000e+00  0.000000e+00  0.000000e+00      NaN  
25%     4.000000e+00  1.000000e+00  1.000000e+01      NaN  
50%     6.000000e+00  2.000000e+00  1.300000e+01      NaN  
75%     9.000000e+00  4.000000e+00  1.600000e+01      NaN  
max     1.200000e+01  6.000000e+00  2.300000e+01      NaN  
std     3.292783e+00  1.772963e+00  4.314874e+00      NaN  
In [6]:
# Histograms for Request Patterns by Year and Season

# 1. Histogram of Request Volume by Year
plt.figure(figsize=(10, 6))
plt.hist(combined_df['Year'], bins=range(2018, 2027), edgecolor='black', color='lightblue')
plt.title('Histogram of Service Request Volume by Year (2018-2025)')
plt.xlabel('Year')
plt.ylabel('Number of Requests')
plt.xticks(range(2018, 2026))
plt.tight_layout()
plt.show()

# 2. Histogram of Top Request Types by Season
top_requests = combined_df['Service Request Type'].value_counts().head(5).index  # Top 5 request types
season_request_df = combined_df[combined_df['Service Request Type'].isin(top_requests)]

plt.figure(figsize=(12, 8))
for season in ['Winter', 'Spring', 'Summer', 'Fall']:
    season_data = season_request_df[season_request_df['Season'] == season]
    plt.hist(season_data['Service Request Type'], bins=5, alpha=0.5, label=season, edgecolor='black')

plt.title('Histogram of Top 5 Service Request Types by Season')
plt.xlabel('Service Request Type')
plt.ylabel('Number of Requests')
plt.legend(title='Season')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. Summary of Top Requests by Season
print("\nTop 5 Service Request Types by Season:")
for season in ['Winter', 'Spring', 'Summer', 'Fall']:
    season_counts = (combined_df[combined_df['Season'] == season]['Service Request Type']
                     .value_counts().head(5))
    print(f"\n{season}:")
    print(season_counts)
No description has been provided for this image
No description has been provided for this image
Top 5 Service Request Types by Season:

Winter:
Service Request Type
Road - Pot hole                            23941
Residential: Bin: Repair or Replace Lid    21916
Publication Request - Solid Waste          20652
Property Standards                         18785
Res / Garbage / Not Picked Up              15227
Name: count, dtype: int64

Spring:
Service Request Type
Road - Pot hole                                    35207
Residential: Bin: Repair or Replace Lid            28035
Property Standards                                 21335
Residential Furniture / Not Picked Up              13681
Residential: Bin: Repair or Replace Body/Handle    13573
Name: count, dtype: int64

Summer:
Service Request Type
Residential: Bin: Repair or Replace Lid    33857
Property Standards                         29675
General Pruning                            27111
Long Grass and Weeds                       22751
CADAVER WILDLIFE                           18964
Name: count, dtype: int64

Fall:
Service Request Type
Residential: Bin: Repair or Replace Lid    28943
Property Standards                         24650
CADAVER WILDLIFE                           16031
Residential Furniture / Not Picked Up      14644
Cadaver - Wildlife                         14287
Name: count, dtype: int64
In [7]:
# Heatmap of Top Request Types by Season
# Create a pivot table for the heatmap
pivot_table = pd.crosstab(combined_df['Season'], 
                          combined_df['Service Request Type'], 
                          values=combined_df['Service Request Type'].count(), 
                          aggfunc='count').loc[['Winter', 'Spring', 'Summer', 'Fall'], top_requests]

plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, annot=True, fmt='.0f', cmap='YlGnBu', cbar_kws={'label': 'Number of Requests'})
plt.title('Heatmap of Top 5 Service Request Types by Season')
plt.xlabel('Service Request Type')
plt.ylabel('Season')
plt.tight_layout()
plt.show()

# 4. Summary of Top Requests by Season
print("\nTop 5 Service Request Types by Season:")
for season in ['Winter', 'Spring', 'Summer', 'Fall']:
    season_counts = (combined_df[combined_df['Season'] == season]['Service Request Type']
                     .value_counts().head(5))
    print(f"\n{season}:")
    print(season_counts)
No description has been provided for this image
Top 5 Service Request Types by Season:

Winter:
Service Request Type
Road - Pot hole                            23941
Residential: Bin: Repair or Replace Lid    21916
Publication Request - Solid Waste          20652
Property Standards                         18785
Res / Garbage / Not Picked Up              15227
Name: count, dtype: int64

Spring:
Service Request Type
Road - Pot hole                                    35207
Residential: Bin: Repair or Replace Lid            28035
Property Standards                                 21335
Residential Furniture / Not Picked Up              13681
Residential: Bin: Repair or Replace Body/Handle    13573
Name: count, dtype: int64

Summer:
Service Request Type
Residential: Bin: Repair or Replace Lid    33857
Property Standards                         29675
General Pruning                            27111
Long Grass and Weeds                       22751
CADAVER WILDLIFE                           18964
Name: count, dtype: int64

Fall:
Service Request Type
Residential: Bin: Repair or Replace Lid    28943
Property Standards                         24650
CADAVER WILDLIFE                           16031
Residential Furniture / Not Picked Up      14644
Cadaver - Wildlife                         14287
Name: count, dtype: int64
In [8]:
# Historical Data by Location (Ward)

# 1. Total Requests per Ward
ward_counts = combined_df['Ward'].value_counts()
print("\nTotal Service Requests by Ward (2018-2025):")
print(ward_counts)

# 2. Heatmap of Request Volume by Ward and Year
ward_year_pivot = pd.crosstab(combined_df['Ward'], combined_df['Year'])
plt.figure(figsize=(12, 10))
sns.heatmap(ward_year_pivot, annot=True, fmt='.0f', cmap='YlGnBu', 
            cbar_kws={'label': 'Number of Requests'})
plt.title('Heatmap of Service Request Volume by Ward and Year (2018-2025)')
plt.xlabel('Year')
plt.ylabel('Ward')
plt.tight_layout()
plt.show()

# 3. Top 5 Request Types per Ward
print("\nTop 5 Service Request Types by Ward:")
for ward in ward_counts.index[:5]:  # Top 5 wards by request volume
    ward_requests = (combined_df[combined_df['Ward'] == ward]['Service Request Type']
                     .value_counts().head(5))
    print(f"\n{ward}:")
    print(ward_requests)
Total Service Requests by Ward (2018-2025):
Ward
Toronto-Danforth (14)            162181
Etobicoke-Lakeshore (03)         158161
Beaches-East York (19)           150380
University-Rosedale (11)         147825
Davenport (09)                   146529
Eglinton-Lawrence (08)           142718
Parkdale-High Park (04)          141563
Scarborough Southwest (20)       129584
Toronto-St. Paul's (12)          123903
Etobicoke Centre (02)            122626
York South-Weston (05)           119429
Don Valley West (15)             112122
Spadina-Fort York (10)           106685
Scarborough-Rouge Park (25)      105995
Scarborough Centre (21)          105139
York Centre (06)                  96189
Toronto Centre (13)               93312
Etobicoke North (01)              86647
Scarborough-Guildwood (24)        85788
Willowdale (18)                   83609
Don Valley North (17)             77239
Humber River-Black Creek (07)     75289
Scarborough-Agincourt (22)        67949
Scarborough North (23)            66251
Don Valley East (16)              63918
Unknown                              25
Name: count, dtype: int64
No description has been provided for this image
Top 5 Service Request Types by Ward:

Toronto-Danforth (14):
Service Request Type
Residential: Bin: Repair or Replace Lid    4933
Property Standards                         4883
Road - Pot hole                            4406
Sewer Service Line-Blocked                 4288
Res / Garbage / Not Picked Up              4244
Name: count, dtype: int64

Etobicoke-Lakeshore (03):
Service Request Type
Residential: Bin: Repair or Replace Lid    4872
Property Standards                         4758
Res / Garbage / Not Picked Up              4666
Road - Pot hole                            4315
Res / Recycle / Not Picked Up              4258
Name: count, dtype: int64

Beaches-East York (19):
Service Request Type
Property Standards                         5634
Residential: Bin: Repair or Replace Lid    4575
Res / Garbage / Not Picked Up              4525
Injured - Wildlife                         4157
Sewer Service Line-Blocked                 3836
Name: count, dtype: int64

University-Rosedale (11):
Service Request Type
Residential: Bin: Repair or Replace Lid            6395
Property Standards                                 4801
Road - Pot hole                                    4128
Residential: Bin: Repair or Replace Body/Handle    3090
Residential: Garbage Bin: Missing                  3042
Name: count, dtype: int64

Davenport (09):
Service Request Type
Residential: Bin: Repair or Replace Lid            8876
Property Standards                                 5094
Residential: Bin: Repair or Replace Body/Handle    4711
Residential: Recycle Bin: Missing                  3332
Residential: Garbage Bin: Missing                  3332
Name: count, dtype: int64
In [9]:
# Final Modeling with Season Focus
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, f1_score
import matplotlib.pyplot as plt
import numpy as np

# Prepare data (filter out 2025)
model_df = combined_df[combined_df['Year'] != 2025].copy()

# 1. Volume Prediction: Random Forest
volume_df = model_df.groupby(['Year', 'Month', 'Season', 'Ward']).size().reset_index(name='Request_Count')
features_vol = ['Year', 'Month', 'Season', 'Ward']
for col in ['Season', 'Ward']:
    volume_df[col] = LabelEncoder().fit_transform(volume_df[col])

X_vol = volume_df[features_vol]
y_vol = volume_df['Request_Count']
y_vol_log = np.log1p(y_vol)  # Normalize
X_train_vol, X_test_vol, y_train_vol, y_test_vol = train_test_split(X_vol, y_vol_log, 
                                                                    test_size=0.3, random_state=42)

# Random Forest Regressor
rf_reg_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_reg_model.fit(X_train_vol, y_train_vol)
y_pred_rf = rf_reg_model.predict(X_test_vol)

# Evaluate
y_pred_rf_exp = np.expm1(y_pred_rf)
y_test_vol_exp = np.expm1(y_test_vol)
print("\nRandom Forest Regressor (Volume Prediction by Season/Ward/Month):")
print(f"R² Score: {r2_score(y_test_vol_exp, y_pred_rf_exp):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_test_vol_exp, y_pred_rf_exp):.4f}")

# Plot
plt.figure(figsize=(10, 6))
plt.scatter(y_test_vol_exp, y_pred_rf_exp, alpha=0.5, color='purple')
plt.plot([y_test_vol_exp.min(), y_test_vol_exp.max()], [y_test_vol_exp.min(), y_test_vol_exp.max()], 'r--')
plt.title
plt.title('Predicted vs Actual Request Volumes (Random Forest)')
plt.xlabel('Actual Request Count')
plt.ylabel('Predicted Request Count')
plt.tight_layout()
plt.show()

# 2. Status Prediction: Random Forest
features_stat = ['Season', 'Service Request Type', 'Division', 'Ward']
for col in features_stat:
    model_df[col] = LabelEncoder().fit_transform(model_df[col])

model_df['Status'] = model_df['Status'].apply(lambda x: 1 if x == 'Closed' else 0)
print("\nClass Distribution (Status):")
print(model_df['Status'].value_counts(normalize=True))

X_stat = model_df[features_stat]
y_stat = model_df['Status']
X_train_stat, X_test_stat, y_train_stat, y_test_stat = train_test_split(X_stat, y_stat, 
                                                                        test_size=0.3, random_state=42)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42, n_jobs=-1)
rf_model.fit(X_train_stat, y_train_stat)
y_pred_stat = rf_model.predict(X_test_stat)

# Evaluate
print("\nRandom Forest (Status Prediction by Season):")
print(f"Accuracy: {accuracy_score(y_test_stat, y_pred_stat):.4f}")
print(f"F1 Score: {f1_score(y_test_stat, y_pred_stat):.4f}")

# Feature importance
feat_importance = pd.Series(rf_model.feature_importances_, index=features_stat).sort_values(ascending=False)
print("\nFeature Importance (Random Forest):")
print(feat_importance)
Random Forest Regressor (Volume Prediction by Season/Ward/Month):
R² Score: 0.7490
Mean Squared Error: 53492.1904
No description has been provided for this image
Class Distribution (Status):
Status
0    0.534693
1    0.465307
Name: proportion, dtype: float64

Random Forest (Status Prediction by Season):
Accuracy: 0.6470
F1 Score: 0.6557

Feature Importance (Random Forest):
Service Request Type    0.695665
Division                0.231902
Ward                    0.049178
Season                  0.023256
dtype: float64
In [10]:
from sklearn.model_selection import GridSearchCV

# Example: Tuning RandomForestRegressor
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_vol, y_train_vol)
print("Best parameters:", grid_search.best_params_)
Best parameters: {'max_depth': 15, 'min_samples_split': 2, 'n_estimators': 200}
In [11]:
!pip install folium geopy pandas
Requirement already satisfied: folium in /opt/anaconda3/lib/python3.12/site-packages (0.19.5)
Requirement already satisfied: geopy in /opt/anaconda3/lib/python3.12/site-packages (2.4.1)
Requirement already satisfied: pandas in /opt/anaconda3/lib/python3.12/site-packages (2.2.2)
Requirement already satisfied: branca>=0.6.0 in /opt/anaconda3/lib/python3.12/site-packages (from folium) (0.8.1)
Requirement already satisfied: jinja2>=2.9 in /opt/anaconda3/lib/python3.12/site-packages (from folium) (3.1.4)
Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.12/site-packages (from folium) (1.26.4)
Requirement already satisfied: requests in /opt/anaconda3/lib/python3.12/site-packages (from folium) (2.32.3)
Requirement already satisfied: xyzservices in /opt/anaconda3/lib/python3.12/site-packages (from folium) (2022.9.0)
Requirement already satisfied: geographiclib<3,>=1.52 in /opt/anaconda3/lib/python3.12/site-packages (from geopy) (2.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2023.3)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.12/site-packages (from jinja2>=2.9->folium) (2.1.3)
Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (2024.8.30)
In [12]:
!pip install geopandas
Requirement already satisfied: geopandas in /opt/anaconda3/lib/python3.12/site-packages (1.0.1)
Requirement already satisfied: numpy>=1.22 in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (1.26.4)
Requirement already satisfied: pyogrio>=0.7.2 in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (0.10.0)
Requirement already satisfied: packaging in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (24.1)
Requirement already satisfied: pandas>=1.4.0 in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (2.2.2)
Requirement already satisfied: pyproj>=3.3.0 in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (3.7.1)
Requirement already satisfied: shapely>=2.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (2.0.7)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.12/site-packages (from pandas>=1.4.0->geopandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas>=1.4.0->geopandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas>=1.4.0->geopandas) (2023.3)
Requirement already satisfied: certifi in /opt/anaconda3/lib/python3.12/site-packages (from pyogrio>=0.7.2->geopandas) (2024.8.30)
Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas>=1.4.0->geopandas) (1.16.0)
In [13]:
import os

# Print the current working directory
print("Current working directory:", os.getcwd())

# List the contents of the 'data' folder
data_folder = 'data'
if os.path.exists(data_folder):
    print("Contents of the 'data' folder:", os.listdir(data_folder))
else:
    print("The 'data' folder does not exist in the current working directory.")
Current working directory: /Users/priyankmali/Final Project EECS 1516 
Contents of the 'data' folder: ['25-ward-model', '25-ward-model-december-2018-wgs84-latitude-longitude.zip', 'SR_all_years_cleaned.csv', 'SR_train_with_weather.csv', '.ipynb_checkpoints', 'SR2021.csv', 'SR2020.csv', 'SR2022.csv', 'SR2023.csv', 'SR_all_years.csv', 'SR2018.csv', 'SR2024.csv', 'SR2025.csv', 'SR2019.csv']
In [14]:
import zipfile

# Path to the ZIP file
zip_path = 'data/25-ward-model-december-2018-wgs84-latitude-longitude.zip'

# List the contents of the ZIP file without extracting
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_contents = zip_ref.namelist()
    print("Contents of the ZIP file:", zip_contents)
Contents of the ZIP file: ['WARD_WGS84.cpg', 'WARD_WGS84.dbf', 'WARD_WGS84.prj', 'WARD_WGS84.sbn', 'WARD_WGS84.sbx', 'WARD_WGS84.shp', 'WARD_WGS84.shp.xml', 'WARD_WGS84.shx', 'WARD_WGS84_readme.txt']
In [15]:
import folium
import pandas as pd
import geopandas as gpd
from folium.plugins import HeatMap
import zipfile
import os
from IPython.display import display  # Import display for inline rendering

# Toronto ward coordinates (latitude, longitude) - keeping this for reference
toronto_ward_coords = {
    'Toronto-Danforth (14)': [43.6785, -79.3526],
    'Etobicoke-Lakeshore (03)': [43.6216, -79.4996],
    'Beaches-East York (19)': [43.6938, -79.2989],
    'University-Rosedale (11)': [43.6659, -79.4009],
    'Davenport (09)': [43.6690, -79.4611],
    'Eglinton-Lawrence (08)': [43.7110, -79.4350],
    'Parkdale-High Park (04)': [43.6452, -79.4496],
    'Scarborough Southwest (20)': [43.7299, -79.2569],
    'Toronto-St. Paul\'s (12)': [43.6895, -79.4083],
    'Etobicoke Centre (02)': [43.6488, -79.5362],
    'Don Valley East (16)': [43.7412, -79.3473],
    'Don Valley North (17)': [43.7931, -79.3624],
    'Don Valley West (15)': [43.7126, -79.3742],
    'Etobicoke North (01)': [43.7224, -79.5678],
    'Humber River-Black Creek (07)': [43.7589, -79.5136],
    'Scarborough Centre (21)': [43.7766, -79.2579],
    'Scarborough North (23)': [43.8345, -79.2626],
    'Scarborough-Agincourt (22)': [43.7864, -79.2917],
    'Scarborough-Guildwood (24)': [43.7631, -79.1887],
    'Scarborough-Rouge Park (25)': [43.8026, -79.1872],
    'Spadina-Fort York (10)': [43.6396, -79.4029],
    'Toronto Centre (13)': [43.6532, -79.3832],
    'Willowdale (18)': [43.7744, -79.4056],
    'York Centre (06)': [43.7462, -79.4563],
    'York South-Weston (05)': [43.6948, -79.5026]
}

# 1. Verify the 'data' folder and file
data_folder = 'data'
zip_path = 'data/25-ward-model-december-2018-wgs84-latitude-longitude.zip'

# Print the current working directory and contents of the 'data' folder
print("Current working directory:", os.getcwd())
if os.path.exists(data_folder):
    print("Contents of the 'data' folder:", os.listdir(data_folder))
else:
    print("The 'data' folder does not exist. Creating it now...")
    os.makedirs(data_folder)
    print("Please move the ZIP file to the 'data' folder and rerun the code.")
    raise FileNotFoundError("The 'data' folder was created, but the ZIP file is missing.")

# Check if the ZIP file exists
if not os.path.exists(zip_path):
    raise FileNotFoundError(f"The ZIP file {zip_path} does not exist. Please check the file name and path.")

# 2. Extract the ZIP file
extract_path = 'data/25-ward-model'

# Create the extraction directory if it doesn’t exist
if not os.path.exists(extract_path):
    os.makedirs(extract_path)

# Extract the ZIP file
try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        print("Contents of the ZIP file:", zip_ref.namelist())
        zip_ref.extractall(extract_path)
    print("ZIP file extracted successfully.")
except Exception as e:
    print(f"Error extracting ZIP file: {e}")
    raise

# List the extracted files to confirm
extracted_files = os.listdir(extract_path)
print("Extracted files:", extracted_files)

# 3. Load ward boundary data (Shapefile)
# Look for a .shp file in the extracted files
shp_file = next((f for f in extracted_files if f.endswith('.shp')), None)
if not shp_file:
    raise FileNotFoundError("No .shp file found in the extracted files.")
shp_path = os.path.join(extract_path, shp_file)
print(f"Loading Shapefile: {shp_path}")

wards_gdf = gpd.read_file(shp_path)

# Inspect the Shapefile
print("Columns in the Shapefile:")
print(wards_gdf.columns)

# Print the first few rows to identify the ward name field
print("\nFirst few rows of the Shapefile:")
print(wards_gdf.head())

# Specify the ward name field manually
ward_name_field = 'AREA_NAME'  # Confirmed from your output

# Verify the ward name field exists
if ward_name_field not in wards_gdf.columns:
    raise ValueError(f"Ward name field '{ward_name_field}' not found in the Shapefile. Available columns: {wards_gdf.columns}")
print(f"\nFirst few ward names (using {ward_name_field}):")
print(wards_gdf[ward_name_field].head())
print(f"\nNumber of wards in the Shapefile: {len(wards_gdf)}")

# 4. Get your ward counts (already calculated in your code)
ward_counts = combined_df['Ward'].value_counts()

# Convert ward_counts to a DataFrame for merging with Shapefile
ward_counts_df = ward_counts.reset_index()
ward_counts_df.columns = ['Ward', 'Request_Count']

# 5. Clean ward names in ward_counts_df to match the Shapefile
ward_counts_df['Ward'] = ward_counts_df['Ward'].str.replace(r'\s*\(\d+\)', '', regex=True)

# Debug: Print ward names to check for mismatches
print("\nWard names in Shapefile:")
print(wards_gdf[ward_name_field].tolist())
print("\nWard names in ward_counts_df (after cleaning):")
print(ward_counts_df['Ward'].tolist())

# 6. Merge the Shapefile with your request counts
wards_gdf = wards_gdf.merge(ward_counts_df, left_on=ward_name_field, right_on='Ward', how='left')

# Debug: Check for unmatched wards after the merge
unmatched_wards = wards_gdf[wards_gdf['Request_Count'].isna()][ward_name_field].tolist()
if unmatched_wards:
    print("\nWarning: The following wards in the Shapefile did not match any wards in ward_counts_df:")
    print(unmatched_wards)
else:
    print("\nAll wards matched successfully.")

# 7. Create map centered on Toronto
toronto_center = [43.70, -79.42]
m = folium.Map(location=toronto_center, zoom_start=11)

# 8. Create a choropleth map for 311 requests
folium.Choropleth(
    geo_data=wards_gdf,
    name='311 Service Requests',
    data=ward_counts_df,
    columns=['Ward', 'Request_Count'],
    key_on=f'feature.properties.{ward_name_field}',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='311 Service Requests',
    nan_fill_color='black',
    nan_fill_opacity=0.3
).add_to(m)

# 9. Add tooltips to show ward name and request count on hover
folium.GeoJson(
    wards_gdf,
    style_function=lambda feature: {
        'fillColor': 'transparent',
        'color': 'black',
        'weight': 0.5,
        'fillOpacity': 0
    },
    tooltip=folium.GeoJsonTooltip(
        fields=['Ward', 'Request_Count'],
        aliases=['Ward:', 'Requests:'],
        localize=True
    )
).add_to(m)

# 10. Add ward name labels at the centroid of each ward
for _, row in wards_gdf.iterrows():
    # Use the LONGITUDE and LATITUDE columns for the centroid
    centroid = [row['LATITUDE'], row['LONGITUDE']]
    ward_name = row[ward_name_field]
    
    # Create a custom DivIcon with the ward name as text
    folium.Marker(
        location=centroid,
        icon=folium.features.DivIcon(
            icon_size=(150, 36),  # Adjust size as needed
            icon_anchor=(75, 18),  # Center the text
            html=f'<div style="font-size: 10pt; color: black; text-align: center; white-space: nowrap;">{ward_name}</div>'
        )
    ).add_to(m)

# 11. Add layer control
folium.LayerControl().add_to(m)

# 12. Display the map inline
print("Displaying the map with ward name labels below:")
display(m)  # Display the map in the notebook
Current working directory: /Users/priyankmali/Final Project EECS 1516 
Contents of the 'data' folder: ['25-ward-model', '25-ward-model-december-2018-wgs84-latitude-longitude.zip', 'SR_all_years_cleaned.csv', 'SR_train_with_weather.csv', '.ipynb_checkpoints', 'SR2021.csv', 'SR2020.csv', 'SR2022.csv', 'SR2023.csv', 'SR_all_years.csv', 'SR2018.csv', 'SR2024.csv', 'SR2025.csv', 'SR2019.csv']
Contents of the ZIP file: ['WARD_WGS84.cpg', 'WARD_WGS84.dbf', 'WARD_WGS84.prj', 'WARD_WGS84.sbn', 'WARD_WGS84.sbx', 'WARD_WGS84.shp', 'WARD_WGS84.shp.xml', 'WARD_WGS84.shx', 'WARD_WGS84_readme.txt']
ZIP file extracted successfully.
Extracted files: ['WARD_WGS84.sbx', 'WARD_WGS84.shx', 'WARD_WGS84.cpg', 'WARD_WGS84.shp', 'WARD_WGS84.dbf', 'WARD_WGS84.shp.xml', 'WARD_WGS84.sbn', 'WARD_WGS84_readme.txt', 'WARD_WGS84.prj']
Loading Shapefile: data/25-ward-model/WARD_WGS84.shp
Columns in the Shapefile:
Index(['AREA_ID', 'AREA_TYPE', 'AREA_S_CD', 'AREA_L_CD', 'AREA_NAME', 'X', 'Y',
       'LONGITUDE', 'LATITUDE', 'geometry'],
      dtype='object')

First few rows of the Shapefile:
   AREA_ID AREA_TYPE AREA_S_CD AREA_L_CD            AREA_NAME           X  \
0  2551040      WD18        16        16      Don Valley East  318237.290   
1  2551044      WD18        03        03  Etobicoke-Lakeshore  303099.474   
2  2551048      WD18        15        15      Don Valley West  314825.876   
3  2551052      WD18        23        23    Scarborough North  324522.149   
4  2551056      WD18        11        11  University-Rosedale  313306.543   

           Y  LONGITUDE   LATITUDE  \
0  4844000.0  -79.33298  43.739716   
1  4831000.0  -79.52087  43.621646   
2  4843000.0  -79.37536  43.728396   
3  4852000.0  -79.25467  43.809672   
4  4837000.0  -79.39432  43.671139   

                                            geometry  
0  POLYGON ((-79.31335 43.71699, -79.3195 43.7156...  
1  POLYGON ((-79.49777 43.65198, -79.49725 43.651...  
2  POLYGON ((-79.35232 43.71573, -79.35209 43.715...  
3  POLYGON ((-79.22591 43.8396, -79.22556 43.8394...  
4  POLYGON ((-79.39004 43.6905, -79.39004 43.6905...  

First few ward names (using AREA_NAME):
0        Don Valley East
1    Etobicoke-Lakeshore
2        Don Valley West
3      Scarborough North
4    University-Rosedale
Name: AREA_NAME, dtype: object

Number of wards in the Shapefile: 25

Ward names in Shapefile:
['Don Valley East', 'Etobicoke-Lakeshore', 'Don Valley West', 'Scarborough North', 'University-Rosedale', 'Spadina-Fort York', 'Davenport', 'York South-Weston', 'Don Valley North', 'Eglinton-Lawrence', 'Etobicoke Centre', 'Etobicoke North', 'Parkdale-High Park', "Toronto-St. Paul's", 'Scarborough-Agincourt', 'Scarborough Centre', 'Scarborough-Guildwood', 'Scarborough-Rouge Park', 'Scarborough Southwest', 'Beaches-East York', 'Willowdale', 'York Centre', 'Humber River-Black Creek', 'Toronto-Danforth', 'Toronto Centre']

Ward names in ward_counts_df (after cleaning):
['Toronto-Danforth', 'Etobicoke-Lakeshore', 'Beaches-East York', 'University-Rosedale', 'Davenport', 'Eglinton-Lawrence', 'Parkdale-High Park', 'Scarborough Southwest', "Toronto-St. Paul's", 'Etobicoke Centre', 'York South-Weston', 'Don Valley West', 'Spadina-Fort York', 'Scarborough-Rouge Park', 'Scarborough Centre', 'York Centre', 'Toronto Centre', 'Etobicoke North', 'Scarborough-Guildwood', 'Willowdale', 'Don Valley North', 'Humber River-Black Creek', 'Scarborough-Agincourt', 'Scarborough North', 'Don Valley East', 'Unknown']

All wards matched successfully.
Displaying the map with ward name labels below:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [16]:
import folium
import pandas as pd
import geopandas as gpd
from folium.plugins import HeatMap
import zipfile
import os
from IPython.display import display  # Import display for inline rendering
import matplotlib.pyplot as plt
import seaborn as sns

# Toronto ward coordinates (latitude, longitude) - keeping this for reference
toronto_ward_coords = {
    'Toronto-Danforth (14)': [43.6785, -79.3526],
    'Etobicoke-Lakeshore (03)': [43.6216, -79.4996],
    'Beaches-East York (19)': [43.6938, -79.2989],
    'University-Rosedale (11)': [43.6659, -79.4009],
    'Davenport (09)': [43.6690, -79.4611],
    'Eglinton-Lawrence (08)': [43.7110, -79.4350],
    'Parkdale-High Park (04)': [43.6452, -79.4496],
    'Scarborough Southwest (20)': [43.7299, -79.2569],
    'Toronto-St. Paul\'s (12)': [43.6895, -79.4083],
    'Etobicoke Centre (02)': [43.6488, -79.5362],
    'Don Valley East (16)': [43.7412, -79.3473],
    'Don Valley North (17)': [43.7931, -79.3624],
    'Don Valley West (15)': [43.7126, -79.3742],
    'Etobicoke North (01)': [43.7224, -79.5678],
    'Humber River-Black Creek (07)': [43.7589, -79.5136],
    'Scarborough Centre (21)': [43.7766, -79.2579],
    'Scarborough North (23)': [43.8345, -79.2626],
    'Scarborough-Agincourt (22)': [43.7864, -79.2917],
    'Scarborough-Guildwood (24)': [43.7631, -79.1887],
    'Scarborough-Rouge Park (25)': [43.8026, -79.1872],
    'Spadina-Fort York (10)': [43.6396, -79.4029],
    'Toronto Centre (13)': [43.6532, -79.3832],
    'Willowdale (18)': [43.7744, -79.4056],
    'York Centre (06)': [43.7462, -79.4563],
    'York South-Weston (05)': [43.6948, -79.5026]
}

# 1. Verify the 'data' folder and file
data_folder = 'data'
zip_path = 'data/25-ward-model-december-2018-wgs84-latitude-longitude.zip'

# Print the current working directory and contents of the 'data' folder
print("Current working directory:", os.getcwd())
if os.path.exists(data_folder):
    print("Contents of the 'data' folder:", os.listdir(data_folder))
else:
    print("The 'data' folder does not exist. Creating it now...")
    os.makedirs(data_folder)
    print("Please move the ZIP file to the 'data' folder and rerun the code.")
    raise FileNotFoundError("The 'data' folder was created, but the ZIP file is missing.")

# Check if the ZIP file exists
if not os.path.exists(zip_path):
    raise FileNotFoundError(f"The ZIP file {zip_path} does not exist. Please check the file name and path.")

# 2. Extract the ZIP file
extract_path = 'data/25-ward-model'

# Create the extraction directory if it doesn’t exist
if not os.path.exists(extract_path):
    os.makedirs(extract_path)

# Extract the ZIP file
try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        print("Contents of the ZIP file:", zip_ref.namelist())
        zip_ref.extractall(extract_path)
    print("ZIP file extracted successfully.")
except Exception as e:
    print(f"Error extracting ZIP file: {e}")
    raise

# List the extracted files to confirm
extracted_files = os.listdir(extract_path)
print("Extracted files:", extracted_files)

# 3. Load ward boundary data (Shapefile)
# Look for a .shp file in the extracted files
shp_file = next((f for f in extracted_files if f.endswith('.shp')), None)
if not shp_file:
    raise FileNotFoundError("No .shp file found in the extracted files.")
shp_path = os.path.join(extract_path, shp_file)
print(f"Loading Shapefile: {shp_path}")

wards_gdf = gpd.read_file(shp_path)

# Inspect the Shapefile
print("Columns in the Shapefile:")
print(wards_gdf.columns)

# Print the first few rows to identify the ward name field
print("\nFirst few rows of the Shapefile:")
print(wards_gdf.head())

# Specify the ward name field manually
ward_name_field = 'AREA_NAME'  # Confirmed from your output

# Verify the ward name field exists
if ward_name_field not in wards_gdf.columns:
    raise ValueError(f"Ward name field '{ward_name_field}' not found in the Shapefile. Available columns: {wards_gdf.columns}")
print(f"\nFirst few ward names (using {ward_name_field}):")
print(wards_gdf[ward_name_field].head())
print(f"\nNumber of wards in the Shapefile: {len(wards_gdf)}")

# 4. Inspect combined_df to confirm columns
print("\nColumns in combined_df:")
print(combined_df.columns)

# 5. Check the existing 'Season' column
print("\nUnique values in the 'Season' column:")
print(combined_df['Season'].unique())

# Verify the 'Season' column has the expected values
expected_seasons = {'Winter', 'Spring', 'Summer', 'Fall'}
if not set(combined_df['Season'].unique()).issubset(expected_seasons):
    print("Recreating 'Season' column as it does not contain expected values...")
    date_column = 'Creation Date'  # Corrected column name
    if date_column not in combined_df.columns:
        raise KeyError(f"Date column '{date_column}' not found in combined_df. Available columns: {combined_df.columns}")

    # Ensure the date column is in datetime format
    combined_df[date_column] = pd.to_datetime(combined_df[date_column])

    def get_season(month):
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:  # months 9, 10, 11
            return 'Fall'

    # Add Season column
    combined_df['Season'] = combined_df[date_column].dt.month.apply(get_season)
    print("Updated unique values in the 'Season' column:")
    print(combined_df['Season'].unique())

# 6. Identify the top 5 service request types overall
request_type_column = 'Service Request Type'  # Confirmed from your output
if request_type_column not in combined_df.columns:
    raise KeyError(f"Request type column '{request_type_column}' not found in combined_df. Available columns: {combined_df.columns}")

top_requests = combined_df[request_type_column].value_counts().head(5).index.tolist()
print("\nTop 5 Service Request Types Overall:")
print(top_requests)

# 7. Create a pivot table for the heatmap
pivot_table = pd.crosstab(combined_df['Season'], 
                          combined_df[request_type_column], 
                          values=combined_df[request_type_column], 
                          aggfunc='count').loc[['Winter', 'Spring', 'Summer', 'Fall'], top_requests]

# 8. Plot the heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, annot=True, fmt='.0f', cmap='YlGnBu', cbar_kws={'label': 'Number of Requests'})
plt.title('Heatmap of Top 5 Service Request Types by Season')
plt.xlabel('Service Request Type')
plt.ylabel('Season')
plt.tight_layout()
plt.show()

# 9. Summary of top requests by season
print("\nTop 5 Service Request Types by Season:")
for season in ['Winter', 'Spring', 'Summer', 'Fall']:
    season_counts = (combined_df[combined_df['Season'] == season][request_type_column]
                     .value_counts().head(5))
    print(f"\n{season}:")
    print(season_counts)

# 10. Calculate service requests by season and ward
seasonal_counts = combined_df.groupby(['Season', 'Ward']).size().reset_index(name='Request_Count')

# Pivot the data to have a column for each season
seasonal_pivot = seasonal_counts.pivot(index='Ward', columns='Season', values='Request_Count').fillna(0).reset_index()

# Clean ward names in seasonal_pivot to match the Shapefile
seasonal_pivot['Ward'] = seasonal_pivot['Ward'].str.replace(r'\s*\(\d+\)', '', regex=True)

# 11. Merge the seasonal data with the Shapefile
wards_gdf_seasonal = wards_gdf.merge(seasonal_pivot, left_on=ward_name_field, right_on='Ward', how='left')

# Debug: Check for unmatched wards after the merge
unmatched_wards = wards_gdf_seasonal[wards_gdf_seasonal['Winter'].isna()][ward_name_field].tolist()
if unmatched_wards:
    print("\nWarning: The following wards in the Shapefile did not match any wards in seasonal_pivot:")
    print(unmatched_wards)
else:
    print("\nAll wards matched successfully for seasonal data.")

# 12. Create map centered on Toronto
toronto_center = [43.70, -79.42]
m = folium.Map(location=toronto_center, zoom_start=11)

# 13. Create a choropleth layer for each season
seasons = ['Winter', 'Spring', 'Summer', 'Fall']
season_colors = {
    'Winter': 'Blues',
    'Spring': 'Greens',
    'Summer': 'Oranges',
    'Fall': 'Reds'
}
for season in seasons:
    folium.Choropleth(
        geo_data=wards_gdf_seasonal,
        name=f'311 Requests - {season}',
        data=wards_gdf_seasonal,
        columns=['Ward', season],
        key_on=f'feature.properties.{ward_name_field}',
        fill_color=season_colors[season],  # Use a different color for each season
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name=f'311 Service Requests ({season})',
        nan_fill_color='black',
        nan_fill_opacity=0.3
    ).add_to(m)

# 14. Add tooltips to show ward name and request count for each season
for season in seasons:
    folium.GeoJson(
        wards_gdf_seasonal,
        style_function=lambda feature: {
            'fillColor': 'transparent',
            'color': 'black',
            'weight': 0.5,
            'fillOpacity': 0
        },
        tooltip=folium.GeoJsonTooltip(
            fields=['Ward', season],
            aliases=['Ward:', f'Requests ({season}):'],
            localize=True
        ),
        name=f'Tooltips - {season}'  # Name the layer for the layer control
    ).add_to(m)

# 15. Add ward name labels at the centroid of each ward
for _, row in wards_gdf.iterrows():
    centroid = [row['LATITUDE'], row['LONGITUDE']]
    ward_name = row[ward_name_field]
    folium.Marker(
        location=centroid,
        icon=folium.features.DivIcon(
            icon_size=(150, 36),
            icon_anchor=(75, 18),
            html=f'<div style="font-size: 10pt; color: black; text-align: center; white-space: nowrap;">{ward_name}</div>'
        )
    ).add_to(m)

# 16. Add layer control
folium.LayerControl().add_to(m)

# 17. Display the map inline
print("Displaying the map with seasonal layers below:")
display(m)
Current working directory: /Users/priyankmali/Final Project EECS 1516 
Contents of the 'data' folder: ['25-ward-model', '25-ward-model-december-2018-wgs84-latitude-longitude.zip', 'SR_all_years_cleaned.csv', 'SR_train_with_weather.csv', '.ipynb_checkpoints', 'SR2021.csv', 'SR2020.csv', 'SR2022.csv', 'SR2023.csv', 'SR_all_years.csv', 'SR2018.csv', 'SR2024.csv', 'SR2025.csv', 'SR2019.csv']
Contents of the ZIP file: ['WARD_WGS84.cpg', 'WARD_WGS84.dbf', 'WARD_WGS84.prj', 'WARD_WGS84.sbn', 'WARD_WGS84.sbx', 'WARD_WGS84.shp', 'WARD_WGS84.shp.xml', 'WARD_WGS84.shx', 'WARD_WGS84_readme.txt']
ZIP file extracted successfully.
Extracted files: ['WARD_WGS84.sbx', 'WARD_WGS84.shx', 'WARD_WGS84.cpg', 'WARD_WGS84.shp', 'WARD_WGS84.dbf', 'WARD_WGS84.shp.xml', 'WARD_WGS84.sbn', 'WARD_WGS84_readme.txt', 'WARD_WGS84.prj']
Loading Shapefile: data/25-ward-model/WARD_WGS84.shp
Columns in the Shapefile:
Index(['AREA_ID', 'AREA_TYPE', 'AREA_S_CD', 'AREA_L_CD', 'AREA_NAME', 'X', 'Y',
       'LONGITUDE', 'LATITUDE', 'geometry'],
      dtype='object')

First few rows of the Shapefile:
   AREA_ID AREA_TYPE AREA_S_CD AREA_L_CD            AREA_NAME           X  \
0  2551040      WD18        16        16      Don Valley East  318237.290   
1  2551044      WD18        03        03  Etobicoke-Lakeshore  303099.474   
2  2551048      WD18        15        15      Don Valley West  314825.876   
3  2551052      WD18        23        23    Scarborough North  324522.149   
4  2551056      WD18        11        11  University-Rosedale  313306.543   

           Y  LONGITUDE   LATITUDE  \
0  4844000.0  -79.33298  43.739716   
1  4831000.0  -79.52087  43.621646   
2  4843000.0  -79.37536  43.728396   
3  4852000.0  -79.25467  43.809672   
4  4837000.0  -79.39432  43.671139   

                                            geometry  
0  POLYGON ((-79.31335 43.71699, -79.3195 43.7156...  
1  POLYGON ((-79.49777 43.65198, -79.49725 43.651...  
2  POLYGON ((-79.35232 43.71573, -79.35209 43.715...  
3  POLYGON ((-79.22591 43.8396, -79.22556 43.8394...  
4  POLYGON ((-79.39004 43.6905, -79.39004 43.6905...  

First few ward names (using AREA_NAME):
0        Don Valley East
1    Etobicoke-Lakeshore
2        Don Valley West
3      Scarborough North
4    University-Rosedale
Name: AREA_NAME, dtype: object

Number of wards in the Shapefile: 25

Columns in combined_df:
Index(['Creation Date', 'Status', 'First 3 Chars of Postal Code', 'Ward',
       'Service Request Type', 'Division', 'Section', 'Year', 'Month',
       'DayOfWeek', 'Hour', 'Season'],
      dtype='object')

Unique values in the 'Season' column:
['Winter' 'Spring' 'Summer' 'Fall']

Top 5 Service Request Types Overall:
['Residential: Bin: Repair or Replace Lid', 'Property Standards', 'Road - Pot hole', 'Res / Garbage / Not Picked Up', 'Residential Furniture / Not Picked Up']
No description has been provided for this image
Top 5 Service Request Types by Season:

Winter:
Service Request Type
Road - Pot hole                            23941
Residential: Bin: Repair or Replace Lid    21916
Publication Request - Solid Waste          20652
Property Standards                         18785
Res / Garbage / Not Picked Up              15227
Name: count, dtype: int64

Spring:
Service Request Type
Road - Pot hole                                    35207
Residential: Bin: Repair or Replace Lid            28035
Property Standards                                 21335
Residential Furniture / Not Picked Up              13681
Residential: Bin: Repair or Replace Body/Handle    13573
Name: count, dtype: int64

Summer:
Service Request Type
Residential: Bin: Repair or Replace Lid    33857
Property Standards                         29675
General Pruning                            27111
Long Grass and Weeds                       22751
CADAVER WILDLIFE                           18964
Name: count, dtype: int64

Fall:
Service Request Type
Residential: Bin: Repair or Replace Lid    28943
Property Standards                         24650
CADAVER WILDLIFE                           16031
Residential Furniture / Not Picked Up      14644
Cadaver - Wildlife                         14287
Name: count, dtype: int64

All wards matched successfully for seasonal data.
Displaying the map with seasonal layers below:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [17]:
import pandas as pd

# Ensure 'Creation Date' is in datetime format
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])

# Filter combined_df to the period from 2018-01-01 to 2025-03-31
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2025-03-31')
combined_df = combined_df[(combined_df['Creation Date'] >= start_date) & (combined_df['Creation Date'] <= end_date)]

print("Date range of combined_df after filtering:")
print(f"Min date: {combined_df['Creation Date'].min()}")
print(f"Max date: {combined_df['Creation Date'].max()}")
Date range of combined_df after filtering:
Min date: 2018-01-01 00:06:06
Max date: 2025-01-31 23:56:11
In [18]:
import numpy as np

# Create a date range from 2018-01-01 to 2025-03-26
dates = pd.date_range(start='2018-01-01', end='2025-03-26', freq='D')

# Simulate daily weather data for Toronto
np.random.seed(42)  # For reproducibility
weather_data = {
    'Date': dates,
    'Mean_Temperature_C': np.random.normal(loc=10, scale=10, size=len(dates)),  # Placeholder
    'Total_Rainfall_mm': np.random.exponential(scale=2, size=len(dates)),  # Placeholder
    'Total_Snowfall_cm': np.zeros(len(dates))  # Placeholder
}

# Convert to DataFrame
weather_df = pd.DataFrame(weather_data)
weather_df['Month'] = weather_df['Date'].dt.month

# Adjust temperature based on monthly averages
# Jan: -2.3°C, Jul: 24.8°C, with seasonal variation
for month in range(1, 13):
    if month in [1, 2, 12]:  # Winter months
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=-2.3, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [6, 7, 8]:  # Summer months
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=24.8, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [3, 4, 5]:  # Spring months
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=12, scale=5, size=len(weather_df[weather_df['Month'] == month]))
    else:  # Fall months (9, 10, 11)
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=15, scale=5, size=len(weather_df[weather_df['Month'] == month]))

# Adjust rainfall: February (19mm/month), July (53mm/month)
# Distribute monthly rainfall across days
monthly_rainfall = {1: 54, 2: 19, 3: 80, 4: 80, 5: 48, 6: 81, 7: 53, 8: 75, 9: 78, 10: 78, 11: 75, 12: 54}  # From web ID: 6, 19
for month in range(1, 13):
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    monthly_total = monthly_rainfall[month]
    daily_avg = monthly_total / (days_in_month / 2)  # Assume rain on half the days
    rain_days = np.random.choice([0, 1], size=days_in_month, p=[0.5, 0.5])  # 50% chance of rain
    weather_df.loc[weather_df['Month'] == month, 'Total_Rainfall_mm'] = rain_days * np.random.exponential(scale=daily_avg, size=days_in_month)

# Adjust snowfall: November to April, with January peak (57mm/month)
# Snowfall occurs from November to April
for month in range(1, 13):
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    if month in [11, 12, 1, 2, 3, 4]:  # Snowfall months
        monthly_snow = 57 if month == 1 else 223 / 6  # Distribute annual 223mm over 6 months, with January peak
        daily_avg = monthly_snow / (days_in_month / 4)  # Assume snow on 25% of days
        snow_days = np.random.choice([0, 1], size=days_in_month, p=[0.75, 0.25])  # 25% chance of snow
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = snow_days * np.random.exponential(scale=daily_avg, size=days_in_month)
    else:
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = 0

# Drop the Month column
weather_df = weather_df.drop(columns=['Month'])

print("Sample of simulated weather data (2018 to March 2025):")
print(weather_df.head())
Sample of simulated weather data (2018 to March 2025):
        Date  Mean_Temperature_C  Total_Rainfall_mm  Total_Snowfall_cm
0 2018-01-01            0.721472           0.000000           0.000000
1 2018-01-02           -0.746016           0.591381           0.000000
2 2018-01-03           -2.221386           0.022729           0.000000
3 2018-01-04           -3.210693           0.108828           1.077358
4 2018-01-05           -2.542752           0.000000           0.000000
In [19]:
# Extract the date (without time) for daily aggregation
combined_df['Date'] = combined_df['Creation Date'].dt.date

# Aggregate service requests by ward and date
daily_requests = combined_df.groupby(['Ward', 'Date']).size().reset_index(name='Request_Count')

# Convert 'Date' back to datetime for merging
daily_requests['Date'] = pd.to_datetime(daily_requests['Date'])

# Merge with the original combined_df to get the 'Season' column
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
daily_requests = daily_requests.merge(
    combined_df[['Ward', 'Date', 'Season']].drop_duplicates(),
    on=['Ward', 'Date'],
    how='left'
)

print("Sample of daily service request counts by ward:")
print(daily_requests.head())
Sample of daily service request counts by ward:
                     Ward       Date  Request_Count  Season
0  Beaches-East York (19) 2018-01-01             30  Winter
1  Beaches-East York (19) 2018-01-02             93  Winter
2  Beaches-East York (19) 2018-01-03             98  Winter
3  Beaches-East York (19) 2018-01-04             84  Winter
4  Beaches-East York (19) 2018-01-05             92  Winter
In [20]:
# Merge weather data with daily service request counts
daily_requests_weather = daily_requests.merge(weather_df, on='Date', how='left')

print("Sample of merged data:")
print(daily_requests_weather.head())
Sample of merged data:
                     Ward       Date  Request_Count  Season  \
0  Beaches-East York (19) 2018-01-01             30  Winter   
1  Beaches-East York (19) 2018-01-02             93  Winter   
2  Beaches-East York (19) 2018-01-03             98  Winter   
3  Beaches-East York (19) 2018-01-04             84  Winter   
4  Beaches-East York (19) 2018-01-05             92  Winter   

   Mean_Temperature_C  Total_Rainfall_mm  Total_Snowfall_cm  
0            0.721472           0.000000           0.000000  
1           -0.746016           0.591381           0.000000  
2           -2.221386           0.022729           0.000000  
3           -3.210693           0.108828           1.077358  
4           -2.542752           0.000000           0.000000  
In [21]:
import scipy.stats as stats
import numpy as np

# Initialize a dictionary to store correlations
correlations = {
    'Ward': [],
    'Season': [],
    'Temperature_Correlation': [],
    'Rainfall_Correlation': [],
    'Snowfall_Correlation': []
}

# List of seasons and wards
seasons = ['Winter', 'Spring', 'Summer', 'Fall']
wards = daily_requests_weather['Ward'].unique()

# Compute correlations for each ward and season
for ward in wards:
    for season in seasons:
        # Filter data for the current ward and season
        subset = daily_requests_weather[(daily_requests_weather['Ward'] == ward) & (daily_requests_weather['Season'] == season)]
        
        if len(subset) < 2:  # Need at least 2 data points to compute correlation
            corr_temp = corr_rain = corr_snow = np.nan
        else:
            # Compute Pearson correlations
            corr_temp, _ = stats.pearsonr(subset['Request_Count'], subset['Mean_Temperature_C'])
            corr_rain, _ = stats.pearsonr(subset['Request_Count'], subset['Total_Rainfall_mm'])
            corr_snow, _ = stats.pearsonr(subset['Request_Count'], subset['Total_Snowfall_cm'])
        
        # Store the results
        correlations['Ward'].append(ward)
        correlations['Season'].append(season)
        correlations['Temperature_Correlation'].append(corr_temp)
        correlations['Rainfall_Correlation'].append(corr_rain)
        correlations['Snowfall_Correlation'].append(corr_snow)

# Create a DataFrame from the correlations
corr_df = pd.DataFrame(correlations)

print("Sample of correlation results:")
print(corr_df.head())
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/1751068593.py:29: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined.
  corr_snow, _ = stats.pearsonr(subset['Request_Count'], subset['Total_Snowfall_cm'])
Sample of correlation results:
                     Ward  Season  Temperature_Correlation  \
0  Beaches-East York (19)  Winter                -0.041033   
1  Beaches-East York (19)  Spring                 0.030053   
2  Beaches-East York (19)  Summer                 0.060603   
3  Beaches-East York (19)    Fall                -0.040910   
4          Davenport (09)  Winter                -0.021434   

   Rainfall_Correlation  Snowfall_Correlation  
0              0.029495             -0.012289  
1             -0.049212             -0.052994  
2             -0.031875                   NaN  
3              0.041697             -0.004217  
4             -0.020808              0.005881  
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/1751068593.py:27: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined.
  corr_temp, _ = stats.pearsonr(subset['Request_Count'], subset['Mean_Temperature_C'])
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/1751068593.py:28: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined.
  corr_rain, _ = stats.pearsonr(subset['Request_Count'], subset['Total_Rainfall_mm'])
In [22]:
import seaborn as sns
import matplotlib.pyplot as plt

# Create a heatmap for each season
for season in seasons:
    # Filter correlations for the current season
    season_corr = corr_df[corr_df['Season'] == season].set_index('Ward')
    
    # Drop the 'Season' column for the heatmap
    season_corr = season_corr.drop(columns=['Season'])
    
    # Plot the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(season_corr, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1)
    plt.title(f'Pearson Correlations: Service Requests vs Weather Attributes ({season})')
    plt.xlabel('Weather Attribute')
    plt.ylabel('Ward')
    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [23]:
import pandas as pd
import numpy as np

# Ensure 'Creation Date' is in datetime format and filter the date range
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2025-03-31')
combined_df = combined_df[(combined_df['Creation Date'] >= start_date) & (combined_df['Creation Date'] <= end_date)]

# Extract the date (without time) for merging with weather data
combined_df['Date'] = combined_df['Creation Date'].dt.date
combined_df['Date'] = pd.to_datetime(combined_df['Date'])

# Simulate weather data (replace with actual data from Environment Canada)
dates = pd.date_range(start='2018-01-01', end='2025-03-26', freq='D')
np.random.seed(42)
weather_data = {
    'Date': dates,
    'Mean_Temperature_C': np.random.normal(loc=10, scale=10, size=len(dates)),
    'Total_Rainfall_mm': np.random.exponential(scale=2, size=len(dates)),
    'Total_Snowfall_cm': np.zeros(len(dates))
}
weather_df = pd.DataFrame(weather_data)
weather_df['Month'] = weather_df['Date'].dt.month
for month in range(1, 13):
    if month in [1, 2, 12]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=-2.3, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [6, 7, 8]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=24.8, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [3, 4, 5]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=12, scale=5, size=len(weather_df[weather_df['Month'] == month]))
    else:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=15, scale=5, size=len(weather_df[weather_df['Month'] == month]))
monthly_rainfall = {1: 54, 2: 19, 3: 80, 4: 80, 5: 48, 6: 81, 7: 53, 8: 75, 9: 78, 10: 78, 11: 75, 12: 54}
for month in range(1, 13):
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    monthly_total = monthly_rainfall[month]
    daily_avg = monthly_total / (days_in_month / 2)
    rain_days = np.random.choice([0, 1], size=days_in_month, p=[0.5, 0.5])
    weather_df.loc[weather_df['Month'] == month, 'Total_Rainfall_mm'] = rain_days * np.random.exponential(scale=daily_avg, size=days_in_month)
for month in range(1, 13):
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    if month in [11, 12, 1, 2, 3, 4]:
        monthly_snow = 57 if month == 1 else 223 / 6
        daily_avg = monthly_snow / (days_in_month / 4)
        snow_days = np.random.choice([0, 1], size=days_in_month, p=[0.75, 0.25])
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = snow_days * np.random.exponential(scale=daily_avg, size=days_in_month)
    else:
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = 0
weather_df = weather_df.drop(columns=['Month'])

# Merge combined_df with weather data
combined_df_weather = combined_df.merge(weather_df, on='Date', how='left')

print("Sample of merged data:")
print(combined_df_weather[['Creation Date', 'Date', 'Service Request Type', 'Season', 'Mean_Temperature_C', 'Total_Rainfall_mm', 'Total_Snowfall_cm']].head())
Sample of merged data:
        Creation Date       Date         Service Request Type  Season  \
0 2018-01-01 00:06:06 2018-01-01                        Noise  Winter   
1 2018-01-01 00:14:04 2018-01-01                        Noise  Winter   
2 2018-01-01 00:17:19 2018-01-01          INJUR/DIST WILDLIFE  Winter   
3 2018-01-01 00:24:35 2018-01-01     Watermain-Possible Break  Winter   
4 2018-01-01 00:28:44 2018-01-01  Water Service Line-No Water  Winter   

   Mean_Temperature_C  Total_Rainfall_mm  Total_Snowfall_cm  
0            0.721472                0.0                0.0  
1            0.721472                0.0                0.0  
2            0.721472                0.0                0.0  
3            0.721472                0.0                0.0  
4            0.721472                0.0                0.0  
In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Filter and prepare the data
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2025-03-31')
combined_df = combined_df[(combined_df['Creation Date'] >= start_date) & (combined_df['Creation Date'] <= end_date)]
print("Unique values in the 'Season' column:")
print(combined_df['Season'].unique())

# Step 2: Identify top 5 requests per season
seasonal_requests = combined_df.groupby(['Season', 'Service Request Type']).size().reset_index(name='Count')
top_requests_by_season = {}
seasons = ['Winter', 'Spring', 'Summer', 'Fall']
for season in seasons:
    season_data = seasonal_requests[seasonal_requests['Season'] == season]
    top_5 = season_data.nlargest(5, 'Count')
    top_requests_by_season[season] = top_5
for season in seasons:
    print(f"\nTop 5 Service Requests in {season}:")
    print(top_requests_by_season[season][['Service Request Type', 'Count']])

# Step 3: Plot the top 5 requests for each season
sns.set(style="whitegrid")
for season in seasons:
    data = top_requests_by_season[season]
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Count', y='Service Request Type', data=data, palette='viridis')
    plt.title(f'Top 5 Service Requests in {season} (2018–2025)', fontsize=14)
    plt.xlabel('Number of Requests', fontsize=12)
    plt.ylabel('Service Request Type', fontsize=12)
    plt.tight_layout()
    plt.show()
Unique values in the 'Season' column:
['Winter' 'Spring' 'Summer' 'Fall']

Top 5 Service Requests in Winter:
                         Service Request Type  Count
2450                          Road - Pot hole  23941
2409  Residential: Bin: Repair or Replace Lid  21916
2329        Publication Request - Solid Waste  20652
2324                       Property Standards  18785
2351            Res / Garbage / Not Picked Up  15227

Top 5 Service Requests in Spring:
                                 Service Request Type  Count
1066                                  Road - Pot hole  35207
1029          Residential: Bin: Repair or Replace Lid  28035
958                                Property Standards  21335
1026            Residential Furniture / Not Picked Up  13681
1028  Residential: Bin: Repair or Replace Body/Handle  13573

Top 5 Service Requests in Summer:
                         Service Request Type  Count
1649  Residential: Bin: Repair or Replace Lid  33857
1585                       Property Standards  29675
1427                          General Pruning  27111
1497                     Long Grass and Weeds  22751
1296                         CADAVER WILDLIFE  18964

Top 5 Service Requests in Fall:
                        Service Request Type  Count
384  Residential: Bin: Repair or Replace Lid  28943
321                       Property Standards  24650
38                          CADAVER WILDLIFE  16031
382    Residential Furniture / Not Picked Up  14644
41                        Cadaver - Wildlife  14287
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/540139518.py:30: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Count', y='Service Request Type', data=data, palette='viridis')
No description has been provided for this image
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/540139518.py:30: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Count', y='Service Request Type', data=data, palette='viridis')
No description has been provided for this image
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/540139518.py:30: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Count', y='Service Request Type', data=data, palette='viridis')
No description has been provided for this image
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/540139518.py:30: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Count', y='Service Request Type', data=data, palette='viridis')
No description has been provided for this image
In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Prepare the data
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2025-03-31')
combined_df = combined_df[(combined_df['Creation Date'] >= start_date) & (combined_df['Creation Date'] <= end_date)]
combined_df['Date'] = combined_df['Creation Date'].dt.date
combined_df['Date'] = pd.to_datetime(combined_df['Date'])

# Step 2: Identify weekdays, weekends, and holidays
holidays = {
    '2018-01-01': 'New Year’s Day', '2018-02-19': 'Family Day', '2018-03-30': 'Good Friday',
    '2018-05-21': 'Victoria Day', '2018-07-02': 'Canada Day', '2018-09-03': 'Labour Day',
    '2018-10-08': 'Thanksgiving', '2018-11-12': 'Remembrance Day', '2018-12-25': 'Christmas Day',
    '2018-12-26': 'Boxing Day',
    '2019-01-01': 'New Year’s Day', '2019-02-18': 'Family Day', '2019-04-19': 'Good Friday',
    '2019-05-20': 'Victoria Day', '2019-07-01': 'Canada Day', '2019-09-02': 'Labour Day',
    '2019-10-14': 'Thanksgiving', '2019-11-11': 'Remembrance Day', '2019-12-25': 'Christmas Day',
    '2019-12-26': 'Boxing Day',
    '2020-01-01': 'New Year’s Day', '2020-02-17': 'Family Day', '2020-04-10': 'Good Friday',
    '2020-05-18': 'Victoria Day', '2020-07-01': 'Canada Day', '2020-09-07': 'Labour Day',
    '2020-10-12': 'Thanksgiving', '2020-11-11': 'Remembrance Day', '2020-12-25': 'Christmas Day',
    '2020-12-28': 'Boxing Day',
    '2021-01-01': 'New Year’s Day', '2021-02-15': 'Family Day', '2021-04-02': 'Good Friday',
    '2021-05-24': 'Victoria Day', '2021-07-01': 'Canada Day', '2021-09-06': 'Labour Day',
    '2021-10-11': 'Thanksgiving', '2021-11-11': 'Remembrance Day', '2021-12-27': 'Christmas Day',
    '2021-12-28': 'Boxing Day',
    '2022-01-03': 'New Year’s Day', '2022-02-21': 'Family Day', '2022-04-15': 'Good Friday',
    '2022-05-23': 'Victoria Day', '2022-07-01': 'Canada Day', '2022-09-05': 'Labour Day',
    '2022-10-10': 'Thanksgiving', '2022-11-11': 'Remembrance Day', '2022-12-26': 'Christmas Day',
    '2022-12-27': 'Boxing Day',
    '2023-01-02': 'New Year’s Day', '2023-02-20': 'Family Day', '2023-04-07': 'Good Friday',
    '2023-05-22': 'Victoria Day', '2023-07-03': 'Canada Day', '2023-09-04': 'Labour Day',
    '2023-10-09': 'Thanksgiving', '2023-11-13': 'Remembrance Day', '2023-12-25': 'Christmas Day',
    '2023-12-26': 'Boxing Day',
    '2024-01-01': 'New Year’s Day', '2024-02-19': 'Family Day', '2024-03-29': 'Good Friday',
    '2024-05-20': 'Victoria Day', '2024-07-01': 'Canada Day', '2024-09-02': 'Labour Day',
    '2024-10-14': 'Thanksgiving', '2024-11-11': 'Remembrance Day', '2024-12-25': 'Christmas Day',
    '2024-12-26': 'Boxing Day',
    '2025-01-01': 'New Year’s Day', '2025-02-17': 'Family Day', '2025-04-18': 'Good Friday',
}
holidays = {pd.to_datetime(date): name for date, name in holidays.items()}
def classify_day(row):
    date = row['Date']
    if date in holidays:
        return 'Holiday'
    if date.weekday() >= 5:
        return 'Weekend'
    return 'Weekday'
combined_df['Day_Type'] = combined_df.apply(classify_day, axis=1)
print("Sample of data with Day_Type:")
print(combined_df[['Date', 'Day_Type']].head())

# Step 3: Compute average daily calls
daily_calls = combined_df.groupby(['Date', 'Day_Type']).size().reset_index(name='Call_Count')
avg_daily_calls = daily_calls.groupby('Day_Type')['Call_Count'].mean().reset_index()
avg_daily_calls['Day_Type'] = pd.Categorical(avg_daily_calls['Day_Type'], categories=['Weekday', 'Weekend', 'Holiday'], ordered=True)
avg_daily_calls = avg_daily_calls.sort_values('Day_Type')
print("\nAverage daily calls by Day_Type:")
print(avg_daily_calls)

# Step 4: Create the bar graph
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
sns.barplot(x='Day_Type', y='Call_Count', data=avg_daily_calls, palette='muted')
plt.title('Average Daily 311 Service Requests by Day Type (2018–2025)', fontsize=14)
plt.xlabel('Day Type', fontsize=12)
plt.ylabel('Average Daily Calls', fontsize=12)
plt.tight_layout()
plt.show()
Sample of data with Day_Type:
        Date Day_Type
0 2018-01-01  Holiday
1 2018-01-01  Holiday
2 2018-01-01  Holiday
3 2018-01-01  Holiday
4 2018-01-01  Holiday

Average daily calls by Day_Type:
  Day_Type   Call_Count
1  Weekday  1328.509499
2  Weekend   587.183333
0  Holiday   597.985294
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/3918527136.py:68: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Day_Type', y='Call_Count', data=avg_daily_calls, palette='muted')
No description has been provided for this image
In [26]:
from sklearn.metrics import mean_absolute_error

# Compute additional error metrics
mae_rf = mean_absolute_error(y_test_vol_exp, y_pred_rf_exp)
rmse_rf = np.sqrt(mean_squared_error(y_test_vol_exp, y_pred_rf_exp))

print(f"Mean Absolute Error (MAE): {mae_rf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.4f}")
Mean Absolute Error (MAE): 174.1699
Root Mean Squared Error (RMSE): 231.2838
In [27]:
from sklearn.metrics import precision_score, recall_score

# Compute additional classification metrics
precision_rf = precision_score(y_test_stat, y_pred_stat)
recall_rf = recall_score(y_test_stat, y_pred_stat)

print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
Precision: 0.6008
Recall: 0.7215
In [28]:
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Prepare the data
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2025-03-31')
combined_df = combined_df[(combined_df['Creation Date'] >= start_date) & (combined_df['Creation Date'] <= end_date)]
if 'Division' not in combined_df.columns:
    raise KeyError("The 'Division' column is not found in combined_df. Please check the column name (e.g., 'Section - Unit', 'Department') and adjust the code.")
print("Unique values in the 'Division' column:")
print(combined_df['Division'].unique())

# Step 2: Compute proportions
division_counts = combined_df['Division'].value_counts()
total_requests = division_counts.sum()
division_proportions = division_counts / total_requests
division_df = pd.DataFrame({
    'Division': division_counts.index,
    'Count': division_counts.values,
    'Proportion': division_proportions.values
})
division_df = division_df.sort_values(by='Count', ascending=False)
print("\nRequest counts and proportions by division:")
print(division_df)

# Step 3: Create the pie chart
labels = division_df['Division']
sizes = division_df['Proportion']
colors = plt.cm.Paired(range(len(labels)))
explode = [0.05] * len(labels)
plt.figure(figsize=(8, 12))
plt.pie(sizes, explode=explode, labels=None, colors=colors, autopct='%1.1f%%', startangle=140)
plt.title('Proportion of 311 Service Requests by Division (2018–2025)', fontsize=14)
plt.legend(labels, title="Division", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
plt.axis('equal')
plt.show()
Unique values in the 'Division' column:
['Municipal Licensing & Standards' 'Toronto Water'
 'Transportation Services' '311' 'Solid Waste Management Services'
 'Urban Forestry' 'City of Toronto' 'Unknown' 'Parks']

Request counts and proportions by division:
                          Division    Count  Proportion
0  Solid Waste Management Services  1028897    0.371301
1  Municipal Licensing & Standards   645772    0.233042
2          Transportation Services   634676    0.229038
3                    Toronto Water   249630    0.090085
4                   Urban Forestry   122159    0.044084
5                              311    53611    0.019347
6                            Parks    20532    0.007409
7                          Unknown    15327    0.005531
8                  City of Toronto      452    0.000163
No description has been provided for this image
In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score, 
                             accuracy_score, f1_score, precision_score, recall_score)

# Set random seed for reproducibility
np.random.seed(42)

# Load cleaned data
combined_df = pd.read_csv("data/SR_all_years_cleaned.csv")
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
combined_df = combined_df[(combined_df['Creation Date'] >= '2018-01-01') & 
                          (combined_df['Creation Date'] <= '2025-03-31')]

# Simulate weather data (from your earlier script)
dates = pd.date_range(start='2018-01-01', end='2025-03-31', freq='D')  # Extended to match data
weather_data = {
    'Date': dates,
    'Mean_Temperature_C': np.random.normal(loc=10, scale=10, size=len(dates)),
    'Total_Rainfall_mm': np.random.exponential(scale=2, size=len(dates)),
    'Total_Snowfall_cm': np.zeros(len(dates))
}
weather_df = pd.DataFrame(weather_data)
weather_df['Month'] = weather_df['Date'].dt.month
monthly_rainfall = {1: 54, 2: 19, 3: 80, 4: 80, 5: 48, 6: 81, 7: 53, 8: 75, 9: 78, 10: 78, 11: 75, 12: 54}
for month in range(1, 13):
    if month in [1, 2, 12]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=-2.3, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [6, 7, 8]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=24.8, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [3, 4, 5]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=12, scale=5, size=len(weather_df[weather_df['Month'] == month]))
    else:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=15, scale=5, size=len(weather_df[weather_df['Month'] == month]))
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    monthly_total = monthly_rainfall[month]
    daily_avg = monthly_total / (days_in_month / 2)
    rain_days = np.random.choice([0, 1], size=days_in_month, p=[0.5, 0.5])
    weather_df.loc[weather_df['Month'] == month, 'Total_Rainfall_mm'] = rain_days * np.random.exponential(scale=daily_avg, size=days_in_month)
for month in range(1, 13):
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    if month in [11, 12, 1, 2, 3, 4]:
        monthly_snow = 57 if month == 1 else 223 / 6
        daily_avg = monthly_snow / (days_in_month / 4)
        snow_days = np.random.choice([0, 1], size=days_in_month, p=[0.75, 0.25])
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = snow_days * np.random.exponential(scale=daily_avg, size=days_in_month)
    else:
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = 0
weather_df = weather_df.drop(columns=['Month'])

# Merge weather data with combined_df
combined_df['Date'] = combined_df['Creation Date'].dt.date
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
combined_df = combined_df.merge(weather_df, on='Date', how='left')

# Filter out 2025 for training (train on 2018-2023, test on 2024)
train_df = combined_df[combined_df['Year'] != 2025].copy()

# Volume Prediction: Prepare Data
volume_df = train_df.groupby(['Year', 'Month', 'Season', 'Ward']).size().reset_index(name='Request_Count')
for col in ['Season', 'Ward']:
    volume_df[col] = LabelEncoder().fit_transform(volume_df[col])

# Create a temporary date column for merging weather data
volume_df['Date'] = pd.to_datetime(volume_df[['Year', 'Month']].assign(day=1))
weather_subset = train_df[['Date', 'Mean_Temperature_C', 'Total_Rainfall_mm', 'Total_Snowfall_cm']].drop_duplicates()

# Merge weather data and drop the Date column
volume_df = volume_df.merge(weather_subset, on='Date', how='left')
X_vol = volume_df[['Year', 'Month', 'Season', 'Ward', 'Mean_Temperature_C', 'Total_Rainfall_mm', 'Total_Snowfall_cm']]
y_vol = np.log1p(volume_df['Request_Count'])

# Split data
X_train_vol, X_test_vol, y_train_vol, y_test_vol = train_test_split(X_vol, y_vol, test_size=0.3, random_state=42)

# Volume Prediction: Random Forest Regressor
rf_reg = RandomForestRegressor(n_estimators=200, max_depth=15, min_samples_split=2, random_state=42, n_jobs=-1)
rf_reg.fit(X_train_vol, y_train_vol)
y_pred_rf = np.expm1(rf_reg.predict(X_test_vol))
y_test_vol_exp = np.expm1(y_test_vol)

# Volume Evaluation
mae_rf = mean_absolute_error(y_test_vol_exp, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test_vol_exp, y_pred_rf))
r2_rf = r2_score(y_test_vol_exp, y_pred_rf)
print("\nVolume Prediction (Random Forest):")
print(f"R² Score: {r2_rf:.4f}")
print(f"Mean Absolute Error (MAE): {mae_rf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.4f}")

# Plot Actual vs Predicted Volumes
plt.figure(figsize=(10, 6))
plt.scatter(y_test_vol_exp, y_pred_rf, alpha=0.5, color='purple')
plt.plot([y_test_vol_exp.min(), y_test_vol_exp.max()], [y_test_vol_exp.min(), y_test_vol_exp.max()], 'r--')
plt.title('Actual vs Predicted 311 Request Volumes, (Random Forest)', fontsize=14)
plt.xlabel('Actual Request Count', fontsize=12)
plt.ylabel('Predicted Request Count', fontsize=12)
plt.tight_layout()
plt.savefig('figure1_volume.png')  # Save for report
plt.show()

# Status Prediction: Prepare Data
for col in ['Season', 'Service Request Type', 'Division', 'Ward']:
    train_df[col] = LabelEncoder().fit_transform(train_df[col])
train_df['Status'] = train_df['Status'].apply(lambda x: 1 if x == 'Closed' else 0)
X_stat = train_df[['Season', 'Service Request Type', 'Division', 'Ward', 'Mean_Temperature_C', 'Total_Rainfall_mm', 'Total_Snowfall_cm']]
y_stat = train_df['Status']
X_train_stat, X_test_stat, y_train_stat, y_test_stat = train_test_split(X_stat, y_stat, test_size=0.3, random_state=42)

# Status Prediction: Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=2, class_weight='balanced', random_state=42, n_jobs=-1)
rf_clf.fit(X_train_stat, y_train_stat)
y_pred_stat = rf_clf.predict(X_test_stat)

# Status Evaluation
accuracy_rf = accuracy_score(y_test_stat, y_pred_stat)
f1_rf = f1_score(y_test_stat, y_pred_stat)
precision_rf = precision_score(y_test_stat, y_pred_stat)
recall_rf = recall_score(y_test_stat, y_pred_stat)
print("\nStatus Prediction (Random Forest):")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")

# Feature Importance for Status Prediction
feat_importance = pd.Series(rf_clf.feature_importances_, index=X_stat.columns).sort_values(ascending=False)
print("\nFeature Importance (Status Prediction):")
print(feat_importance)


# Save updated data for reference
train_df.to_csv("data/SR_train_with_weather.csv", index=False)
print("\nUpdated training data saved as 'data/SR_train_with_weather.csv'")
Volume Prediction (Random Forest):
R² Score: 0.8253
Mean Absolute Error (MAE): 145.0985
Root Mean Squared Error (RMSE): 192.9323
No description has been provided for this image
Status Prediction (Random Forest):
Accuracy: 0.7306
F1-Score: 0.7263
Precision: 0.6893
Recall: 0.7674

Feature Importance (Status Prediction):
Service Request Type    0.404704
Mean_Temperature_C      0.184243
Division                0.148536
Total_Rainfall_mm       0.116991
Total_Snowfall_cm       0.078485
Ward                    0.038405
Season                  0.028635
dtype: float64

Updated training data saved as 'data/SR_train_with_weather.csv'
In [ ]: